*****************************
**# A. TSUSA Codes Under MFA
*****************************
*This comes from the concordance from the 1978 TSUSA, which lists the mapping between
*TSUSA codes and MFA categories. Pages 136-141
import excel "raw/mfa_78.xlsx", sheet("Sheet1") clear allstring

drop A C E G I K
drop if _n == 1

rename B tsusa
save "processing/mfa_78_temp", replace

foreach letter in D F H J M {
	keep `letter'
	drop if `letter' == ""
	rename `letter' tsusa
	append using  "processing/mfa_78_temp"
	save "processing/mfa_78_temp", replace
}

keep tsusa
drop if missing(tsusa)
replace tsusa = trim(tsusa)

replace tsusa = tsusa + "000" if length(tsusa) == 5
replace tsusa = tsusa + "00" if length(tsusa) == 6
replace tsusa = tsusa + "0" if length(tsusa) == 7
replace tsusa = substr(tsusa,1,8)
replace tsusa = subinstr(tsusa,".","",1)

preserve
keep if substr(tsusa,4,2) == "--"
gen tsusa_missing_5 = substr(tsusa,1,3) + substr(tsusa,-2,2)
save "processing/mfa_78_tsusa_missing_middle", replace
	* These are all codes of the form, e.g., 321.xx58
	* Looks like textile product codes
restore

drop if substr(tsusa,4,2) == "--"
assert length(tsusa)==7
duplicates drop
	* 1 duplciate obs
save "processing/mfa_78_tsusa", replace
	
***************************************************
**# B. Share of TSUSA Codes in SITC Covered by MFA
***************************************************
* Concord TSUSA7-SITC
use "raw/TSUSA7_SITC4_crosswalk.dta", clear
keep if year == 1978
rename tsusa7 tsusa
tostring tsusa, replace
gen tsusa_missing_5 = substr(tsusa,1,3) + substr(tsusa,-2,2)

merge 1:1 tsusa using "processing/mfa_78_tsusa"
	* m=1: tsusa codes in concordance but are not tsusa mfa codes (most tsusa codes, as expected)
	* m=2: 51 out of 1224 tsusa mfa codes not in concordance
drop if _m == 2
gen mfa_tsusa = _merge == 3
drop _merge
merge m:1 tsusa_missing_5 using "processing/mfa_78_tsusa_missing_middle.dta"
	* m=1: most tsusa7 codes don't have missing middle problem
	* m=2: 126 tsusa7 codes that have missing middle problem but not in our concorded data
drop if _m == 2
replace mfa_tsusa = 1 if _merge == 3
drop _m tsusa_missing_5
gen n = 1
collapse (sum) n mfa, by(sitc)
gen mfa_share = mfa/n
keep sitc mfa_share
label var mfa_share "Share of tsusa7 codes within sitc covered by mfa"

rename sitc sitc2 

***************************
**# C. Concord SITC-HS-SIC
***************************
mmerge sitc2 using "processing/hs_sitc_concordance.dta", type(1:n)
	* m=1: 64 sitc codes (out of 842) for which we have mfa share but not in hs->sic concordance
	* m=2: 300 sitc codes in concordance for which we do not have mfa share. 
drop if _m == 1
replace mfa_share = 0 if _merge == 2
replace mfa_share = mfa_share*hs_sitc_weight

collapse (sum) mfa_share, by(hs92)
	* unique on hs92 except for hs92=271000 before collapse
	* 6 obs of hs92=271000, each with .1667 weight, but all have mfa_share=0
	
merge 1:m hs92 using "processing/sic_hs_concordance.dta"
	* m=1: 19 unique hs codes in hs-level mfa data that not in hs-sic concordance
	* m=2: 30 unique hs codes in hs-sic concordance that not in hs-level mfa data
drop if _m == 1
replace mfa_share = 0 if _merge == 2
replace mfa_share = mfa_share*sic_hs_weight
	* weights add to 1 within sic. So taking weighted average of hs mfa shares to get sic mfa share
collapse (sum) mfa_share, by(sic)
drop if regexm(sic,"X") == 1
destring sic, replace
 
****************
**# D. Clean up
****************
label var mfa_share  "Sh of tsusa7 lines covered by mfa within sic, uses 1978 tsusa7->sitc concordance"
save "processing/mfa_sic_shares.dta", replace

